/* Purpose of this execution file is to prepare analysis of entrepreneurs. 

The selection criteria are described in the paper.

Input files:
owners_first_yr: a complete list of initial owners for a new firm. Derived from owners't'.dta
rskap0616_sel_`t'_ut: yearly accounting files
Bransjeinfo_0616.dta: panel at year-firm level with industry variables
Foretaksinfo_0616.dta: panel at year-firm level with foundation year, number of employees
adssb`t': yearly sociodemographic information for all Norwegian individuals
family.dta: file with blood relationships
excess_transition_adapted  - industry info from other dofile

Output files:
StartupPanel
EneurSocioPanel - yearly panel with sociodemographic info for entrepreneurs
EneursCrossSec - one obs per entrepreneur
parents_socio - yearly panel with sociodemographic information about parents of entrepreneurs
hans_file2 - startup panel with socio info for entrepreneurs and parents


Key variables:
pid: individual unique identifier for founder (personal identification number)
fpid: individual unique identifier for fathers
stiftetorgnr: unique identifier for firms
faar: first accounting year for new firm (we use this to calculate firm age etc)
stiftaar: year of foundation
p_ variables: socio for father
equity_4 (and other similar): equity in year 4
edut89: education level
pearn: pension earning income (labor income)
bruttoform: gross wealth
mstat: marital status
bransjek variables: industry code 
salgsinn: sales in kroner
yr: calendar year */

******************
** 0. Globals  ***
******************
* Named variable lists. svars, pvars and indvars are defined here but not referenced
* in the visible part of this file (presumably used by other do-files -- confirm).
global svars   "pid stiftetorgnr share faar naceSSB equity_0 sales_4 survive4 assets_4 employees_4 roa_4 ebitda_4 aarsrs_4 tech hi_tech"
global pvars   "pid rel relpid p_dyear p_nace5 p_nace5_impute p_yob p_plant p_se p_pearn p_yr_pearn p_edut89"
global indvars "pid age eduy sex brform mstat pearn yr nace5 plant iq edut89"

* Variables requested from the yearly adssb files when building the founder socio panel
global socio "yob spid wpid fpid wealth_pctile pearn bruttoform sex occ orgnr mstat komres komarb educ styrk doeds capital_income dividend hrs nace isic_Aa"

* Placeholder that the user is expected to overwrite with a directory
global adssb "PUT YOUR DIRECTORY HERE"

* Variables requested from the industry, accounting and firm-register files
global bransjevars "bransjek_02 bransjet_02 bransjek_07 bransjet_07"
global accvars     "ak ek aarsrs salgsinn gjeld sumeiend inn_ek ebitda utb rentekost"
global foretakvars "stiftaar ansatte bors_aks"  // morsorgnr_lopenr

set matsize 800

*****************************
*** 0. Prelim ***************
*****************************

***************************************************************
*** I. Ownership data - initial sample of founder-startups **** //drop if share<=.33 or faar>2011
***************************************************************
* Part A of the owner file: keep foundation years 1999-2003
use if stiftaar>=1999 & stiftaar<2004 using owners_first_yrA, clear
tabulate stiftaar, missing
save temp1, replace

* Part B of the owner file (original note: 2004 and onwards); rows without a foundation year are removed
use owners_first_yrB, clear
drop if stiftaar==.
tabulate stiftaar, missing
save temp2, replace

* Stack both parts and remove rows without an owner id
use temp1, clear
append using temp2
drop if pid==""
duplicates report pid stiftetorgnr

* Entrepreneur definition: ownership share above one third.
* Firms with faar after 2011 are removed because outcomes are measured after faar+4.
keep if share>.33 & !(faar>2011)

keep pid stiftetorgnr share faar stiftaar
sort pid
save eneurs_new, replace  // founder-firm events with faar<=2011 and share>.33; a pid can appear on several rows

*******************************************************************
** II. Startup panel and cross-sec (startup list defined above) ***
*******************************************************************
forvalues t=1999/2015 {
    * Firm list: eneurs_new can hold several owner rows per firm, so reduce to one row per firm
    use stiftetorgnr faar using eneurs_new, clear
    duplicates drop stiftetorgnr, force
    rename stiftetorgnr org_lopenr
    sort org_lopenr

    * Accounting variables for this year.
    * NOTE(review): no nokeep option here, so firms that appear only in the accounting file are retained -- confirm intended
    merge org_lopenr using rskap0616_sel_`t'_ut
    tabulate _merge
    drop _merge
    keep org_lopenr faar $accvars

    generate aar=`t'

    * Industry codes at the firm-year level
    sort org_lopenr aar
    merge org_lopenr aar using Bransjeinfo_0616, nokeep keep($bransjevars)
    tabulate _merge
    drop _merge

    * Firm register information at the firm-year level
    sort org_lopenr aar
    merge org_lopenr aar using Foretaksinfo_0616, nokeep keep($foretakvars)
    tabulate _merge
    drop _merge

    rename ansatte employees
    rename ak sumaksje

    * Firm size from the adssb file; skipped for 2015.
    * The adssb file is first reduced to one row per orgnr to avoid an n to m merge.
    if `t'<2015 {
        preserve
        use orgnr employees_SSB using adssb`t', clear
        drop if orgnr==""
        duplicates drop orgnr, force
        sort orgnr
        save size, replace
        restore

        rename org_lopenr orgnr
        sort orgnr
        merge orgnr using size, nokeep keep(employees_SSB)  // number of employees at the firm-year level
        erase size.dta
        tabulate _merge
        drop _merge
        rename orgnr org_lopenr
    }

    saveold temp`t'.dta, replace
    display in red "`t'"
    duplicates report org_lopenr
    count if aar==.
}

* Stack the yearly files and remove the temporary copies
clear
forvalues t=1999/2015 {
    append using temp`t'
    erase temp`t'.dta
}
bysort aar: count  // observations per year; the original author expects this to be constant

rename org_lopenr stiftetorgnr
rename aar yr

sort stiftetorgnr yr
save StartupPanel, replace  // firm-year panel for the startups

****** variables into StartupPanel

* Inflation adjustment: divide the monetary accounting variables by cpi02 from CPI1998
use StartupPanel, clear
sort yr
merge yr using CPI1998, nokeep
tabulate _merge
drop _merge

foreach money of varlist gjeld ebitda sumaksje sumeiend salgsinn aarsrs ek rentekost {
    replace `money'=`money'/cpi02
}
drop cpi*

save StartupPanel, replace

** fillin: make the firm-year panel rectangular
use StartupPanel, clear

* fillin needs a firm key; s is a temporary numeric group id built from stiftetorgnr
egen s=group(stiftetorgnr)
fillin s yr
tab _fillin

* fillin leaves every variable other than s and yr missing on the rows it creates.
* Copy the firm identifier and first accounting year from an original row of the same firm,
* otherwise the created rows lose their firm once s is dropped and the yr<faar filter cannot apply to them.
bys s (_fillin): replace stiftetorgnr=stiftetorgnr[1] if _fillin==1
bys s (_fillin): replace faar=faar[1] if _fillin==1

drop _fillin
drop if yr<faar
drop s

* activeyear (survival): 1 when a result for the year (aarsrs) is reported, 0 otherwise
generate activeyear=(aarsrs!=.)
* Outside the first accounting year, sales below 50 also count as inactive (original note: NOK 50K)
replace activeyear=0 if yr!=faar & salgsinn<50
tabulate activeyear

* create roa = ebitda / total assets, with total assets in [0,10] set to 10 before dividing
* NOTE(review): ss stores the original sumeiend but is never copied back, so sumeiend
* stays floored at 10 in everything downstream -- confirm this is intended
gen ss=sumeiend
replace sumeiend=10 if sumeiend>=0&sumeiend<=10
gen roa=ebitda/sumeiend
replace roa=. if sumeiend<0
drop ss

* winsorize roa (winsor is a user-written command)
winsor roa, gen(a) p(0.05)
replace roa=a
drop a

** average roa over the early years of the firm (firmage 0 to 4)
* The mean is taken over rroa, i.e. roa restricted to firmage<5.
* The previous code built rroa but averaged roa over all years of the firm.
gen firmage=yr-faar
gen rroa=roa if firmage<5
bys stiftetorgnr: egen roa4=mean(rroa)
drop rroa

* winsorize aarsrs
winsor aarsrs, gen(a) p(0.05)
replace aarsrs=a
drop a

* survive: for each horizon h=0..13, record whether the firm is active in year faar+h
* Create all fourteen variables first so that they sit next to each other in the dataset
forvalues h=0/13 {
    generate survive`h'=.
}

forvalues h=0/13 {
    * Value on the row of year faar+h
    replace survive`h'=1 if activeyear==1 & yr==faar+`h'
    replace survive`h'=0 if activeyear==0 & yr==faar+`h'

    * Spread that value to every row of the firm
    bysort stiftetorgnr: egen su=max(survive`h')
    replace survive`h'=su
    drop su
}

* Firm outcomes at fixed horizons (faar+0, +2, ..., +10), taken only from active years
* and then copied to every row of the firm
local outcomes "sumeiend activeyear salgsinn employees aarsrs roa ebitda"
foreach y of local outcomes {
    forvalues h=0(2)10 {
        generate `y'_`h'=`y' if activeyear==1 & yr==faar+`h'  // NB: inactive years give missing
        bysort stiftetorgnr: egen m=max(`y'_`h')
        replace `y'_`h'=m
        drop m
    }
}

rename bransjek_02 naceSSB
save temp, replace

** Link bransjek_02 and bransjek_07: for each bransjek_07 find the most frequent bransjek_02
** (to account for the break in the industry code series)
use aar bransjek_02 bransjek_07 using Bransjeinfo_0616, clear
drop if bransjek_02==. | bransjek_07==.
tab aar

* n = number of rows for each (bransjek_07, bransjek_02) pair; m = largest n within bransjek_07
bys bransjek_07 bransjek_02: gen n=_N
bys bransjek_07: egen m=max(n)
keep if n==m

* Several bransjek_02 codes can tie for the mode. Keep the smallest one so that the link
* is the same on every run; the previous duplicates drop depended on an unstable sort order.
bys bransjek_07 (bransjek_02): keep if _n==1

rename bransjek_02 bransjek_02_mode
keep bransjek_07 bransjek_02_mode
order bransjek_07
sort bransjek_07
save link_bransje, replace

** Fill missing naceSSB (the former bransjek_02) with the modal code linked to bransjek_07
use temp, clear
sort bransjek_07
merge bransjek_07 using link_bransje, nokeep keep(bransjek_02_mode)
tabulate _merge
drop _merge

* Keep the unimputed code for reference, then impute
generate naceSSB_orig=naceSSB
replace naceSSB=bransjek_02_mode if naceSSB==.

* One row per firm-year
duplicates drop stiftetorgnr yr, force

sort stiftetorgnr yr
save StartupPanel, replace

** Cross-section of startups: the row of the first accounting year
use StartupPanel, clear
keep if yr==faar

duplicates drop stiftetorgnr, force

sort stiftetorgnr yr
save StartupFirstyr, replace  // one observation per startup

**********************************
** Throw out eneur events 65-70 **
**********************************
* Attach first-year firm information to each founder-firm row
use eneurs_new, clear
joinby stiftetorgnr using StartupFirstyr, unmatched(both)
tabulate _merge

** Remove two-digit industries 65-70 (original note: financials and real estate).
* nace2 is an integer because of floor(), so the range 65-70 equals nace2>64 & nace2<71.
* Rows with missing naceSSB are not removed here.
generate nace2=floor(naceSSB/1000)
tabulate nace2
drop if inrange(nace2,65,70)
drop nace2

keep pid stiftetorgnr faar share
sort pid stiftetorgnr
save eneurs_new, replace  // this is the final sample of entrepreneurs

***********************************
** III. Entrepreneur socio panel **
***********************************

* Yearly socio files for the founders, 1996-2013
forvalues t=1996/2013 {
    * One row per founder
    use pid using eneurs_new, clear
    duplicates drop pid, force
    sort pid
    merge pid using adssb`t', nokeep keep($socio)
    tabulate _merge
    drop _merge
    generate yr=`t'
    sort pid
    save Stifter`t', replace
}

* Stack the yearly files into one panel and remove the temporary copies
clear
forvalues t=1996/2013 {
    append using Stifter`t'
    erase Stifter`t'.dta
}

** Adjustments and extra variables
drop if pid==""
sort pid yr

* 1. Death year and month: split doeds_aar_mnd and spread the earliest value to all rows of the person
generate dyear=substr(doeds_aar_mnd,1,4)
destring dyear, replace
generate dmonth=substr(doeds_aar_mnd,5,.)
destring dmonth, replace

foreach part in dyear dmonth {
    bysort pid: egen first_`part'=min(`part')
    replace `part'=first_`part'
    drop first_`part'
}

* 2. Education and municipality of residence: fill gaps from the previous year, then from the following year
foreach v in educ komres {
    sort pid yr
    replace `v'=`v'[_n-1] if pid==pid[_n-1] & `v'==.

    gsort pid -yr
    replace `v'=`v'[_n-1] if pid==pid[_n-1] & `v'==.
}

* 3. Gender: use the largest reported value per person; code 1 when never reported
*    (original note: assume male if missing)
bysort pid: egen ss=max(sex)
replace sex=ss
drop ss
replace sex=1 if sex==.

* 6. Inflation adjustment of earnings and gross wealth
sort yr
merge yr using CPI1998, nokeep
tabulate _merge
drop _merge

foreach money of varlist pearn bruttoform {
    replace `money'=`money'/cpi02
}
drop cpi*

* 7. Moving average of gross wealth over the five preceding years.
*    The current year is excluded (the middle 0 in the window option).
sort pid
egen ppid=group(pid)  // numeric person id, needed by tsset
tsset ppid yr
sort pid yr
tssmooth ma bruttoformavgbefore=bruttoform, window(5 0 0)

* Self-employment indicator from occ. Original coding note:
* 0=outside the labour force, 1=employee, 2=self-employed, 3=fully unemployed, 4=on labour market measures
destring occ, replace
generate se=1 if occ==2
replace se=0 if inlist(occ,1,3,4)

sort pid yr
*tssmooth ma se_exp10=se, window(10 0 0)
*tssmooth ma hrs_exp10=hrs, window(10 0 0)
*gen fulltime=hrs==3
*replace fulltime=. if hrs==.
generate orgnr_missing=(orgnr=="")

* 8. Marital dummy: 1 when mstat==1, missing when mstat is missing
generate single=(mstat==1) if mstat!=.

tabulate mstat
tabulate single

compress
sort pid yr
save EneurSocioPanel, replace

*****************************************************************************
** IV. Select one startup and keep one row with socio info per individual ***
*****************************************************************************

** Step 1: keep startups where the founder is between 22 and 55 two years before startup
** (original note: same as paul new version)
use eneurs_new, clear
joinby pid using EneurSocioPanel, unmatched(both)
tab _merge
drop if _merge!=3
drop _merge

** age restriction
* age is measured in year yr; age-(yr-faar+2) is the age in year faar-2
gen age=yr-yob
drop if age>55+(yr-faar+2)
drop if age<22+(yr-faar+2)

** Step 2: select a socio row

** sociodemographic info: keep rows (faar-4, faar-2)
keep if yr==faar-4|yr==faar-3|yr==faar-2
* destring nace, replace //this is work nace

** drop if occ is 2, 3 or 4 in that year (2 = self-employed according to the coding note in section III)
*keep if occ==1
drop if occ==2|occ==3|occ==4

** drop if hrs is 1 or 2 in that year (original note: part-time worker)
destring hrs, replace
drop if hrs==1|hrs==2

** drop if missing or zero earnings in that year
drop if pearn==.

drop if pearn==0
* drop if bruttoform==.
*drop if educ==.

** keep year of max income (faar-4,faar-2)
* pearn was divided by cpi02 in section III and is no longer an integer. The maximum is
* therefore stored as double: a long would truncate it and the equality test below would
* fail for nearly every row.
bys pid: egen double m=max(pearn)
keep if pearn==m
drop m
* One row per founder; for a founder with several startups the retained row is whichever comes first
duplicates drop pid, force

** misc variables
gen logbrform=ln(bruttoform+1)
gen logbrformsq=logbrform*logbrform
*gen logequity=ln(equity_0+1)
*gen logequitysq=logequity*logequity
gen logpearn=ln(pearn+1)

* NOTE(review): married is 0 when mstat is missing, whereas single in EneurSocioPanel
* is set to missing in that case -- confirm which treatment is wanted
gen married=mstat==2

save EneursCrossSec, replace  // list of pid-startups; the original author notes the only remaining exclusion is p_nace!=.

*************************
*** V. Parental info ****
*************************
* Parent links (rel=="par") for the individuals in EneursCrossSec
use family, clear
keep if rel=="par"
keep pid relpid
joinby pid using EneursCrossSec
keep pid relpid
sort pid relpid
save eneur_parents, replace

*** parents se: flag parents with non-zero net business income in any year 1986-1996
* After the renames, rootpid is the entrepreneur and pid is the parent
use eneur_parents, clear
rename pid rootpid
rename relpid pid
save temp, replace

* Yearly income of the parents; only sex==1 rows are kept (original note: fathers)
forvalues t=1996(-1)1986 {
    use temp, clear
    sort pid
    merge pid using adssb`t', nokeep keep(net_business_income sex yr)
    tabulate _merge
    drop _merge
    keep if sex==1
    drop sex
    save temp`t', replace
}

clear
forvalues t=1996(-1)1986 {
    append using temp`t'
    erase temp`t'.dta
}

* Zero business income is treated as no business income
replace net_business_income=. if net_business_income==0
bysort yr: summarize

* se = 1 when the parent has business income in at least one year
generate se_year=(net_business_income!=.)
tabulate se_year, missing
bysort pid: egen se=max(se_year)
keep se pid
duplicates drop pid, force
tabulate se
sort pid
save parents_se, replace

* Attach the self-employment flag to the parent list; pid now identifies the parent
use eneur_parents, clear
rename pid rootpid
rename relpid pid
sort pid
merge pid using parents_se, nokeep keep(se)
tabulate _merge
drop _merge
sort pid
save eneur_parents, replace

** Parental marital status panel, 1986-2015 (sex==1 rows only)
forvalues t=1986/2015 {
    use pid rootpid using eneur_parents, clear
    sort pid
    merge pid using adssb`t', nokeep keep(yr mstat sex spid mpid komres)
    tabulate _merge
    keep if sex==1
    drop _merge sex
    save temp`t', replace
}

* Stack the yearly files and remove the temporary copies
clear
forvalues t=1986/2015 {
    append using temp`t'
    erase temp`t'.dta
}

compress
sort pid yr
save parents_mstat_panel, replace

** Parents industry and other socio variables
use eneur_parents, clear

** For every variable in vars3, take the most recent non-missing value over 1996-1986
** and record in yr_<variable> the year it was taken from
global vars3 "isic_Aa occ sex yob doeds_aar_mnd orgnr nace"
* global vars3 "isic_Aa occ orgnr nace sex"

* Empty placeholders that the update merges below fill in
foreach v of global vars3 {
    generate str `v'=""
    generate yr_`v'=.
}

* sex and yob placeholders are converted to numeric before merging
capture destring sex, replace
capture destring yob, replace

foreach v of global vars3 {
    display in red "`v'"
    * Walk backwards from 1996 (original note: 1992 first year of nace)
    forvalues t=1996(-1)1986 {
        sort pid
        merge pid using adssb`t', update nokeep keep(`v')
        tabulate _merge
        replace yr_`v'=`t' if _merge==4  // _merge==4: a missing value was replaced with a non-missing one
        drop _merge
    }
}

* Keep sex==1 rows (original note: fathers) and prefix the parent variables with p_
keep if sex==1
destring nace, replace
rename nace p_nace5
foreach v in yob se orgnr isic_Aa occ {
    rename `v' p_`v'
}

drop yr_sex yr_yob yr_doeds*

* Year of death from the first four characters of doeds_aar_mnd
generate p_dyear=substr(doeds_aar_mnd,1,4)
destring p_dyear, replace
drop doeds_aar_mnd

keep yr* pid rootpid p_nace5 p_dyear p_yob p_se p_orgnr p_isic_Aa p_occ
duplicates drop pid rootpid, force  // original note: redundant
sort rootpid
save parents_socio, replace

** Link isic->nace. Use 1996 where we have both isic and nace codes
* isic and nace in the varlist are presumably abbreviations of isic5 and nace5, given the renames below -- confirm
use isic nace yr if yr==1996 using individuals_panel, clear
rename isic5 isic
rename nace5 nace
destring isic nace, replace
drop if isic==.
drop if nace==.
duplicates tag isic nace, gen(a)
capture bys isic: tab a

* Most frequent nace code within each isic code. Without minmode, egen mode() returns
* missing when several codes tie, which leaves those isic codes without an imputed nace.
* minmode picks the smallest tied code, so the link is complete and reproducible.
bys isic: egen nace_impute=mode(nace), minmode
duplicates drop isic, force   // nace_impute is constant within isic, so any row can be kept
keep isic nace_impute
sort isic
saveold isic, replace

****** Imputed nace into the parents socio file
use parents_socio, clear
rename p_isic_Aa isic
destring isic, replace
sort isic
merge isic using isic, nokeep keep(nace_impute)
tabulate _merge
drop _merge

* Use the imputed code where the parent nace code is missing or zero; keep the original for reference
generate p_nace_before_impute=p_nace5
replace p_nace5=nace_impute if p_nace5==. | p_nace5==0

* Two-digit parent industry
generate p_nace2=floor(p_nace5/1000)
tabulate p_nace2, missing

rename nace_impute p_nace_impute
rename isic p_isic

sort rootpid
save parents_socio, replace

************************************************************************
*** VI. Parent and startup info into entrepreneur cross-sectional file *
************************************************************************
* Founder cross-section (original note: after age etc restrictions but before dropping nace 65-70 etc.)
use EneursCrossSec, clear
capture drop _merge
sort stiftetorgnr
*merge stiftetorgnr using StartupFirstyr  //dangerous n:m merge!!!
* First-year firm information; only matched rows are kept
joinby stiftetorgnr using StartupFirstyr, unmatched(both)
tabulate _merge
keep if _merge==3
drop _merge
count

* Original note: rows only in using are startups dropped because not first startup, missing socio of founder etc

* Parent information is keyed on rootpid (the entrepreneur); joinby is used because the link is not 1:m
rename pid rootpid
sort rootpid
joinby rootpid using parents_socio, unmatched(both)
tabulate _merge
drop _merge pid  // the pid dropped here is the parent id from parents_socio

rename rootpid pid

** Age of the entrepreneur in the year the father died
generate age_at_father_death=p_dyear-yob
tabulate age_at_father_death, missing
tabulate p_dyear, missing

* same_orgnr: 1 when orgnr equals p_orgnr, 0 otherwise, missing when orgnr is empty
generate same_orgnr=0 if orgnr!=""
replace same_orgnr=1 if orgnr!="" & orgnr==p_orgnr
tabulate same_orgnr, missing

duplicates report pid

save hans_file, replace

*********************************************************************
*** VII. Drop if missing nace of father plus create same variables **  //incl eneurs work experience from industry
*********************************************************************
use hans_file, clear
destring nace, replace

* Two-digit versions of the startup, father and founder work industry codes
global list "naceSSB p_nace5 nace"
foreach v of global list {
    generate `v'_2=floor(`v'/1000)
}

** Drop rows without a father industry or a startup industry
count
drop if p_nace5==.
drop if naceSSB==.

** Import industry names for the father two-digit industry
rename p_nace2 Code
sort Code
merge Code using nace_rev_1_1_2digit, nokeep keep(Description Definition)
tabulate _merge
drop _merge
rename Code p_nace2

** Same-industry indicators at the five-digit (suffix empty) and two-digit (suffix _2) level:
** _startup compares the startup industry with the father industry,
** _work compares the founder work industry with the father industry
local sfx5 ""
local sfx2 "_2"
foreach d in 5 2 {
    local s `sfx`d''

    generate same`d'_startup=1 if naceSSB`s'==p_nace5`s' & naceSSB`s'!=.
    replace same`d'_startup=0 if naceSSB`s'!=p_nace5`s' & naceSSB`s'!=. & p_nace5`s'!=.

    generate same`d'_work=1 if nace`s'==p_nace5`s' & nace`s'!=.
    replace same`d'_work=0 if nace`s'!=p_nace5`s' & nace`s'!=. & p_nace5`s'!=.
}

foreach d in 5 2 {
    tabulate same`d'_work, missing
    tabulate same`d'_startup, missing
    tabulate same`d'_work same`d'_startup, missing
}

** Means and cell sizes across father industries
bysort p_nace5: egen mean_same5_startup=mean(same5_startup)  // mean over non-missing rows, filled in on every row of the group
*bys p_nace5: egen mean_same5_work=mean(same5_work)  //ok because non-missing for everybody
bysort p_nace5: generate N5=_N

bysort p_nace5_2: egen mean_same2_startup=mean(same2_startup)
*bys p_nace5_2: egen mean_same2_work=mean(same2_work)
bysort p_nace5_2: generate N2=_N

generate N=_N

* Log outcomes in year 4 and log startup equity
generate logsales4=ln(salgsinn_4+1)
generate logassets4=ln(sumeiend_4+1)
generate logemployees4=ln(employees_4+1)
generate logequity=ln(sumaksje)
generate logequitysq=logequity^2

* f_dead is 1 only when a death year is recorded and it is not before faar.
* NOTE(review): a father who died before the startup year is coded 0 -- confirm this is intended
generate f_dead=p_dyear!=.
replace f_dead=0 if p_dyear<faar
tabulate f_dead, missing

** Relatedness between father industry and startup industry
sort p_nace5_2 naceSSB_2
merge p_nace5_2 naceSSB_2 using excess_transition_adapted, nokeep keep(f1 f2 f3 *xtile*)
tabulate _merge
drop _merge

* Indicator for r3_xtile equal to 10; missing when r3_xtile is missing
generate frac_decile10=r3_xtile==10
replace frac_decile10=. if r3_xtile==.

save hans_file2, replace

* Largest number of employees over the first two years of the firm (firmage 0 and 1)
use yr firmage employees stiftetorgnr using StartupPanel, clear
keep if firmage==0|firmage==1
drop if employees==.

* Take the maximum while both years are still in memory, then reduce to one row per firm.
* The previous order dropped duplicates first, so the maximum was taken over a single arbitrary row.
bys stiftetorgnr: egen max_empl=max(employees)
keep stiftetorgnr max_empl
duplicates drop stiftetorgnr, force
tab max_empl, m
sort stiftetorgnr
save max_empl, replace

* Add max_empl to the analysis file
use hans_file2, clear
sort stiftetorgnr
merge stiftetorgnr using max_empl, nokeep keep(max_empl)
tabulate _merge
drop _merge

* Missing employees in year 4 become zero
replace employees_4=0 if employees_4==.
* dummy flags rows where logbrform was missing before it is set to zero
generate dummy=logbrform==.
replace logbrform=0 if logbrform==.
* roa4 is blanked for firms with survive4 equal to 0
replace roa4=. if survive4==0

* Variable labels.
* NOTE(review): sex==1 is used elsewhere in this file to select fathers, so the label on sex may be inverted -- confirm
label variable share "Share of Venture Owned"
label variable p_se "Father Self-Employed"
label variable age "Age"
label variable educ "Education, Level"
label variable sex "Female"
label variable married "Married"
label variable same2_work "Worker in parent's industry"
label variable same2_startup "Entrepreneur in parent's industry"
label variable bruttoform "Assets in that year (000)"
label variable pearn "Highest earnings (000), 1996-1998"
label variable logequity "Ln(Startup Equity)"
label variable logbrform "Ln(Assets pre-Entrepreneurship)"
label variable survive4 "5-year Survival"
label variable r3_xtile "Decile of relatedness/10"
label variable f3 "Average relatedness"
label variable frac_decile10 "Fraction in decile 10"
label variable max_empl "max number of employees first two years"

save hans_file2, replace










